# Intentionally no `rm(list = ls())` here: run this script in a fresh R session
# instead. Wiping the global environment destroys the caller's objects when the
# script is source()d and still does not reset loaded packages or options.
# Every object used below is assigned before it is read.

library(stringr)
library(ggplot2)
library(quanteda)
library(quanteda.textplots)
library(dplyr)
library(tidylo)

library(tidyr)
library(tidytext)
library(bigrquery)
library(tidyverse)
library(readxl)
library(openxlsx)

## Read the cleaned ZA data
endeligdata <- read_excel("INSERT WD/cleandata_text_ZA.xlsx")

# Trim surrounding whitespace, then drop rows whose q18 answer is missing or
# empty.
endeligdata$q18 <- str_trim(endeligdata$q18)
endeligdata <- endeligdata[!(is.na(endeligdata$q18) | endeligdata$q18 == ""), ]

# Keep only the columns used below.
newdata <- endeligdata %>%
  transmute(q18, D)

# Replace every non-alphanumeric character with a space.
newdata$q18 <- str_replace_all(newdata$q18, "[^[:alnum:]]", " ")

# Drop answers that contain nothing but whitespace after the replacement above.
# Comparing against a single " " would miss answers such as "..." that turn
# into several spaces, so test the trimmed string instead.
newdata <- newdata[!(is.na(newdata$q18) | str_trim(newdata$q18) == ""), ]

# Tokenize the q18 answers into words.
# `split_hyphens` is the current name of the deprecated `remove_hyphens`
# argument. All non-alphanumeric characters in q18 were already replaced by
# spaces, so no hyphens are left and the resulting tokens are unaffected.
toks <- newdata$q18 %>%
  tokens(what = "word",
         remove_numbers = TRUE,
         remove_punct = TRUE,
         remove_symbols = TRUE,
         remove_separators = TRUE,
         split_hyphens = TRUE,
         remove_url = TRUE,
         verbose = TRUE)

quanteda_options("language_stemmer" = "english") # set stemming language to English

toks <- toks %>%
  tokens_tolower() %>% # convert all words to lower case
  tokens_remove(stopwords("english")) %>% # remove English stop words
  tokens_wordstem() # stem all words

# Build the document-feature matrix from the tokens
txt.mat <- dfm(toks)

# Keep only features longer than three characters
# (i.e. drop features of one to three characters).
txt.mat <- txt.mat[, str_length(colnames(txt.mat)) > 3]

# The dictionaries below are matched against the features of txt.mat, which
# are lower-cased and stemmed, so entries must be written in stemmed form.

# Intergroup (race) dictionary for ZA
dict <- dictionary(list(
  race = c("racism", "race", "discrimin", "racial", "histor", "histori",
           "color", "segreg", "privileg", "apartheid", "slaveri")
))

# Interpersonal (effort) dictionary for ZA.
# "lazi" and "chanc" are the stems the English Snowball stemmer produces for
# "lazy" and "chance"; the unstemmed spellings alone cannot match a stemmed
# feature. The original spellings are kept so the entry list is only extended.
dict_work <- dictionary(list(
  effort = c("hard", "effort", "lazy", "lazi", "work", "deserv", "luck",
             "chance", "chanc")
))

# Count, per document, how many times words from the race dictionary appear
text <- dfm_lookup(txt.mat, dictionary = dict)

# Add the count and a 0/1 indicator (any match) to the data frame
newdata$race <- as.vector(dfm_match(text, "race"))
newdata$race_dum <- as.numeric(newdata$race != 0)

# Count, per document, how many times words from the effort dictionary appear
text <- dfm_lookup(txt.mat, dictionary = dict_work)

# Add the count and a 0/1 indicator (any match) to the data frame
newdata$effort <- as.vector(dfm_match(text, "effort"))
newdata$effort_dum <- as.numeric(newdata$effort != 0)

write.xlsx(newdata, "INSERT WD/dictionary_ZA.xlsx")

### Create figure SM19

# Keep only the answers that matched the race dictionary at least once
visualize_all <- newdata %>%
  transmute(q18, D, race, race_dum) %>%
  filter(race_dum == 1)

# Tokenize the selected answers into words.
# `split_hyphens` is the current name of the deprecated `remove_hyphens`
# argument.
toks <- visualize_all$q18 %>%
  tokens(what = "word",
         remove_numbers = TRUE,
         remove_punct = TRUE,
         remove_symbols = TRUE,
         remove_separators = TRUE,
         split_hyphens = TRUE,
         remove_url = TRUE,
         verbose = TRUE)

quanteda_options("language_stemmer" = "english") # set stemming language to English

toks <- toks %>%
  tokens_tolower() %>% # convert all words to lower case
  tokens_remove(stopwords("english")) %>% # remove English stop words
  tokens_wordstem() # stem all words

# Build the document-feature matrix from the tokens
txt.mat <- dfm(toks)

# Check out the top-appearing features in the dfm
topfeatures(txt.mat)

# Ten shades of the same blue, with alpha scaled from 0.1 to 1
col <- vapply(
  seq(0.1, 1, 0.1),
  function(x) adjustcolor("#1F78B4", x),
  character(1)
)

# Save figure SM19
pdf(file = "INSERT WD/figure_SM19.pdf",
    width = 3.5,
    height = 3.5,
    compress = TRUE)

# Close the PDF device even if plotting fails, so that a failed run does not
# leave the device open and capture later plots.
tryCatch(
  textplot_wordcloud(txt.mat, adjust = 0.5, random_order = FALSE,
                     color = col, rotation = FALSE),
  finally = dev.off()
)














